import requests, re, os, time
from lxml import etree
import pandas as pd


def func(url):
    """Scrape one search-result page and append one record per paper.

    Downloads ``url``, selects the child ``div`` elements of
    ``div#bdxs_result_lists`` and, for every one except the first, extracts
    the year and title, then downloads the paper's detail page to read its
    keywords.

    Each record is a dict with the keys ``'年份'`` (year, whitespace removed),
    ``'标题'`` (title text) and ``'关键词'`` (list of keyword strings). Records
    are printed and appended to the module-level list ``a``, which must exist
    before this function is called.

    Args:
        url: Address of the search-result page to scrape.

    Returns:
        None. Results are delivered through the module-level list ``a``.

    Raises:
        requests.RequestException: If the result page itself cannot be
            downloaded. Failures on individual detail pages are reported
            and the record is kept with an empty keyword list.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    # Without a timeout a single stalled connection would hang the crawl.
    timeout = 10
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
    }

    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = 'utf8'
    xml = etree.HTML(res.text)
    ls = xml.xpath('//div[@id="bdxs_result_lists"]/div')

    # The first child div is skipped, as in the original code
    # (presumably it is not a result entry -- confirm against the page).
    for i in ls[1:]:
        hrefs = i.xpath('./div[1]/h3/a/@href')
        if not hrefs:
            # No title link: this div is not a paper entry, so there is
            # nothing to follow. Previously this raised IndexError.
            continue

        data = {}
        data['年份'] = re.sub(r'\s', '', ''.join(i.xpath('./div[1]/div[@class="sc_info"]/span[@class="sc_time"]/text()')))
        data['标题'] = re.sub(r'\n', '', ' '.join(i.xpath('./div[1]/h3/a//text()')))

        # Resolve relative or scheme-relative links against the page URL;
        # absolute links pass through unchanged.
        href = urljoin(url, hrefs[0])

        key = []
        try:
            response = requests.get(href, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the year/title already scraped instead of aborting the run.
            print('detail page request failed:', href, exc)
        else:
            response.encoding = 'utf8'
            x = etree.HTML(response.text)
            # Defensive: only query the tree if parsing produced one.
            if x is not None:
                key = x.xpath('//p[@class="kw_main"]/span/a/text()')

        data['关键词'] = key
        print(data)
        a.append(data)


if __name__ == '__main__':
    # Shared result list: func() appends one dict per scraped paper to this
    # module-level name.
    a = []
    # Manual debug switch: set to a truthy value to stop after the first page.
    flag = 0
    try:
        for page in range(1, 161):
            if page == 1:
                url = 'http://xueshu.baidu.com/s?wd=oocyte&tn=SE_baiduxueshu_c1gjeupa&sc_hit=1&sort=sc_time&bcp=2&ie=utf-8&filter=sc_year%3D%7B2009%2C2019%7D%28sc_level%3A%3D%7B1%7D%29'
            else:
                # `pn` is treated as a zero-based result offset (page 1 has
                # no `pn`, i.e. offset 0), so page N starts at (N - 1) * 10.
                # The previous `page * 10` skipped the results at offset 10.
                url = f'http://xueshu.baidu.com/s?wd=oocyte&pn={(page - 1) * 10}&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sort=sc_time&filter=sc_year%3D%7B2009%2C2019%7D%28sc_level%3A%3D%7B1%7D%29&bcp=2&sc_hit=1'
            func(url)
            print(page, '页已爬完，数据量=', len(a))
            # Short pause between result pages.
            time.sleep(.5)
            if flag:
                break
    finally:
        # Write whatever was collected even if the crawl was interrupted or
        # failed part-way; the exception (if any) still propagates afterwards.
        pd.DataFrame(a).to_excel('信息.xlsx', index=False)
